Ensemble evaluation & tuning

ROC for MajorityVoteClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

colors = ['black', 'orange', 'blue', 'green']
linestyles = [':', '--', '-.', '-']
for clf, label, clr, ls in zip(all_clf, clf_labels, colors, linestyles):
    # Use the predicted probability of the positive class as the score
    y_pred = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred)
    roc_auc = auc(x=fpr, y=tpr)
    plt.plot(fpr, tpr, color=clr, linestyle=ls,
             label='%s (auc=%0.2f)' % (label, roc_auc))
plt.legend(loc='lower right')
# Diagonal: performance of a random classifier
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2)
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid(alpha=0.5)
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')
plt.show()
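As a quick sanity check (a sketch reusing the same all_clf, clf_labels, and train/test split as above), the AUC values shown in the legend can also be computed directly with sklearn.metrics.roc_auc_score, without constructing the curve:
from sklearn.metrics import roc_auc_score

for clf, label in zip(all_clf, clf_labels):
    # Same positive-class probabilities as used for the ROC curves above
    y_score = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    print('%s: ROC AUC = %.2f' % (label, roc_auc_score(y_true=y_test, y_score=y_score)))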
Decision boundaries
To show the decision tree on the same scale as the other models, its training data is standardized as well (the two pipelines already standardize internally).
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)

from itertools import product

# Grid over the standardized feature space, padded by 1 on each side
x_min = X_train_std[:, 0].min() - 1
x_max = X_train_std[:, 0].max() + 1
y_min = X_train_std[:, 1].min() - 1
y_max = X_train_std[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

f, axarr = plt.subplots(nrows=2, ncols=2, sharex='col', sharey='row', figsize=(8, 5))
# product([0, 1], [0, 1]) yields the four subplot indices (0,0), (0,1), (1,0), (1,1)
for idx, clf, tt in zip(product([0, 1], [0, 1]), all_clf, clf_labels):
    clf.fit(X_train_std, y_train)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.3)
    axarr[idx[0], idx[1]].scatter(X_train_std[y_train == 0, 0],
                                  X_train_std[y_train == 0, 1],
                                  c='blue', marker='^', s=50)
    axarr[idx[0], idx[1]].scatter(X_train_std[y_train == 1, 0],
                                  X_train_std[y_train == 1, 1],
                                  c='green', marker='o', s=50)
    axarr[idx[0], idx[1]].set_title(tt)
plt.text(-3.5, -4.5, s='Sepal width [standardized]', ha='center', va='center', fontsize=12)
plt.text(-10.5, 4.5, s='Petal length [standardized]', ha='center', va='center', fontsize=12, rotation=90)
plt.show()
mv_clf.get_params()

{'pipeline-1': Pipeline(steps=[('sc', StandardScaler()),
                               ('clf', LogisticRegression(C=0.001, random_state=1))]),
 'decisiontreeclassifier': DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0),
 'pipeline-2': Pipeline(steps=[('sc', StandardScaler()),
                               ('clf', KNeighborsClassifier(n_neighbors=1))]),
 'pipeline-1__memory': None,
 'pipeline-1__steps': [('sc', StandardScaler()), ('clf', LogisticRegression(C=0.001, random_state=1))],
 'pipeline-1__verbose': False,
 'pipeline-1__sc': StandardScaler(),
 'pipeline-1__clf': LogisticRegression(C=0.001, random_state=1),
 'pipeline-1__sc__copy': True,
 'pipeline-1__sc__with_mean': True,
 'pipeline-1__sc__with_std': True,
 'pipeline-1__clf__C': 0.001,
 'pipeline-1__clf__class_weight': None,
 'pipeline-1__clf__dual': False,
 'pipeline-1__clf__fit_intercept': True,
 'pipeline-1__clf__intercept_scaling': 1,
 'pipeline-1__clf__l1_ratio': None,
 'pipeline-1__clf__max_iter': 100,
 'pipeline-1__clf__multi_class': 'auto',
 'pipeline-1__clf__n_jobs': None,
 'pipeline-1__clf__penalty': 'l2',
 'pipeline-1__clf__random_state': 1,
 'pipeline-1__clf__solver': 'lbfgs',
 'pipeline-1__clf__tol': 0.0001,
 'pipeline-1__clf__verbose': 0,
 'pipeline-1__clf__warm_start': False,
 'decisiontreeclassifier__ccp_alpha': 0.0,
 'decisiontreeclassifier__class_weight': None,
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 1,
 'decisiontreeclassifier__max_features': None,
 'decisiontreeclassifier__max_leaf_nodes': None,
 'decisiontreeclassifier__min_impurity_decrease': 0.0,
 'decisiontreeclassifier__min_impurity_split': None,
 'decisiontreeclassifier__min_samples_leaf': 1,
 'decisiontreeclassifier__min_samples_split': 2,
 'decisiontreeclassifier__min_weight_fraction_leaf': 0.0,
 'decisiontreeclassifier__random_state': 0,
 'decisiontreeclassifier__splitter': 'best',
 'pipeline-2__memory': None,
 'pipeline-2__steps': [('sc', StandardScaler()), ('clf', KNeighborsClassifier(n_neighbors=1))],
 'pipeline-2__verbose': False,
 'pipeline-2__sc': StandardScaler(),
 'pipeline-2__clf': KNeighborsClassifier(n_neighbors=1),
 'pipeline-2__sc__copy': True,
 'pipeline-2__sc__with_mean': True,
 'pipeline-2__sc__with_std': True,
 'pipeline-2__clf__algorithm': 'auto',
 'pipeline-2__clf__leaf_size': 30,
 'pipeline-2__clf__metric': 'minkowski',
 'pipeline-2__clf__metric_params': None,
 'pipeline-2__clf__n_jobs': None,
 'pipeline-2__clf__n_neighbors': 1,
 'pipeline-2__clf__p': 2,
 'pipeline-2__clf__weights': 'uniform'}

The dictionary returned by get_params() shows exactly how to access the parameters of the individual classifiers, e.g. 'pipeline-1__clf__C' for the C of the logistic regression inside the first pipeline.
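The same double-underscore notation works for setting parameters, too. A minimal sketch using the standard set_params() method on mv_clf (the value 100.0 is purely illustrative):
# Change C of the logistic regression inside the first pipeline by its nested name
mv_clf.set_params(**{'pipeline-1__clf__C': 100.0})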
from sklearn.model_selection import GridSearchCV

params = {'decisiontreeclassifier__max_depth': [1, 2],
          'pipeline-1__clf__C': [0.01, 0.1, 100.0]}
grid = GridSearchCV(estimator=mv_clf, param_grid=params, cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)
Code error: iterating over grid.grid_scores_ raises an AttributeError, because grid_scores_ was removed in scikit-learn 0.20. The same report can be produced from the cv_results_ dictionary instead:
for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print('%0.3f +/- %0.2f %r' % (grid.cv_results_['mean_test_score'][r],
                                  grid.cv_results_['std_test_score'][r] / 2.0,
                                  grid.cv_results_['params'][r]))
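The best parameter combination and its cross-validated score can then be read off via GridSearchCV's standard best_params_ and best_score_ attributes:
print('Best parameters: %s' % grid.best_params_)
print('ROC AUC: %.2f' % grid.best_score_)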
Ensembles with stacking
Stacking is an ensemble with two layers.
The predictions of the individual classifiers in the first layer are fed into the second layer.
In the second layer, another classifier (typically a logistic regression) is fit to the first-layer predictions to produce the final prediction.
StackingClassifier and StackingRegressor were added in scikit-learn 0.22.
Applying grid search to a StackingClassifier
from sklearn.ensemble import StackingClassifier

stack = StackingClassifier(estimators=[('lr', pipe1), ('dt', clf2), ('knn', pipe3)],
                           final_estimator=LogisticRegression())
# Nested parameter names now start with the estimator names given above
params = {'dt__max_depth': [1, 2],
          'lr__clf__C': [0.001, 0.1, 100.0]}
grid = GridSearchCV(estimator=stack, param_grid=params, cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)
for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print('%0.3f +/- %0.2f %r' % (grid.cv_results_['mean_test_score'][r],
                                  grid.cv_results_['std_test_score'][r] / 2.0,
                                  grid.cv_results_['params'][r]))

0.950 +/- 0.07 {'dt__max_depth': 1, 'lr__clf__C': 0.001}
0.983 +/- 0.02 {'dt__max_depth': 1, 'lr__clf__C': 0.1}
0.967 +/- 0.05 {'dt__max_depth': 1, 'lr__clf__C': 100.0}
0.950 +/- 0.07 {'dt__max_depth': 2, 'lr__clf__C': 0.001}
0.983 +/- 0.02 {'dt__max_depth': 2, 'lr__clf__C': 0.1}
0.967 +/- 0.05 {'dt__max_depth': 2, 'lr__clf__C': 100.0}
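As a follow-up sketch (assuming GridSearchCV's default refit=True, so best_estimator_ has been refit on the full training set), the winning stacking model can be evaluated on the held-out test set:
from sklearn.metrics import roc_auc_score

best_stack = grid.best_estimator_  # refit on X_train, y_train by GridSearchCV
y_score = best_stack.predict_proba(X_test)[:, 1]
print('Test ROC AUC: %.3f' % roc_auc_score(y_true=y_test, y_score=y_score))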